package com.rock.crawl.actor

import akka.actor.Actor
import com.rock.crawl.message.CrawlTask
import scala.collection.mutable.ArrayBuffer
import com.rock.crawl.message.CrawlLinks
import org.jsoup.Jsoup
import com.rock.crawl.store.FileStore
import org.jsoup.select.Elements
import java.net.URL
import scalaj.http.Http
import scalaj.http.HttpOptions

/**
 * Worker actor that handles [[CrawlTask]] messages.
 *
 * For each task it downloads the page at the task's URL, passes the raw bytes to the
 * task's processor, extracts outgoing links (`a[href]`, `[src]`, `link[href]`) and
 * replies to the sender with a [[CrawlLinks]] message containing the absolute URLs.
 */
class CrawlWorkerActor extends Actor {
  // Imported locally so the class does not depend on changes to the file-level imports.
  import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream}

  def receive: Receive = {
    case CrawlTask(url: String, processor: ((String, Array[Byte]) => Unit)) => {
      println("fetching " + url)
      fetchPage(url, processor)
    }
  }

  /**
   * Downloads `url`, feeds the body to `processor`, and sends the discovered links
   * back to the current sender.
   *
   * Failures while downloading or inside `processor` propagate to the caller.
   * Failures while parsing the page or extracting links are printed and swallowed;
   * in that case no [[CrawlLinks]] reply is sent.
   *
   * @param url       absolute URL of the page to fetch
   * @param processor callback that receives the URL and the raw response body
   */
  def fetchPage(url: String, processor: (String, Array[Byte]) => Unit): Unit = {
    val connectTimeoutMs = 30 * 1000
    val readTimeoutMs = 5 * 60 * 1000

    val req = Http(url)
      .option(HttpOptions.connTimeout(connectTimeoutMs))
      .option(HttpOptions.readTimeout(readTimeoutMs))

    // Read the status code and the body from the SAME connection. Calling
    // `responseCode` and then `asBytes` would issue two separate HTTP requests.
    // NOTE(review): relies on scalaj-http's pre-1.0 `Request.process` - confirm
    // against the library version on the classpath.
    val data = req.process { conn =>
      println("status:" + conn.getResponseCode + "  " + url)
      readFully(conn.getInputStream)
    }
    processor(url, data)

    try {
      // Hand jsoup the raw bytes with a null charset name so that it picks the
      // charset itself, instead of decoding with the JVM's platform default.
      val doc = Jsoup.parse(new ByteArrayInputStream(data), null, url)
      val links = doc.select("a[href]")
      val media = doc.select("[src]")
      val imports = doc.select("link[href]")

      val found =
        getUrls(url, links, "href") ++ getUrls(url, media, "src") ++ getUrls(url, imports, "href")

      // The same URL can be matched by more than one selector; report it once.
      sender ! CrawlLinks(found.distinct)
    } catch {
      case t: Exception => t.printStackTrace()
    }
  }

  /** Reads `in` until end of stream, always closing it, and returns the bytes read. */
  private def readFully(in: InputStream): Array[Byte] = {
    try {
      val out = new ByteArrayOutputStream()
      val buffer = new Array[Byte](8192)
      var read = in.read(buffer)
      while (read != -1) {
        out.write(buffer, 0, read)
        read = in.read(buffer)
      }
      out.toByteArray
    } finally {
      in.close()
    }
  }

  /**
   * Collects the distinct absolute http(s) URLs found in attribute `attrName` of
   * `elements`, in document order.
   *
   * @param parentUrl URL of the page the elements came from; used to resolve relative links
   * @param elements  elements selected from the parsed page
   * @param attrName  name of the attribute that holds the link (e.g. `href` or `src`)
   */
  private def getUrls(parentUrl: String, elements: Elements, attrName: String): Seq[String] = {
    (0 until elements.size())
      .map(i => elements.get(i).attr(attrName))
      .filter(isValidUrl)
      .flatMap(raw => makeFullUrl(parentUrl, raw))
      .distinct
  }

  /** A link is usable when it is non-null, non-blank and not the bare `#` placeholder. */
  private def isValidUrl(url: String): Boolean = {
    Option(url).map(_.trim()).exists(u => !u.isEmpty() && !u.equals("#"))
  }

  /**
   * Resolves `url` against `parent` and returns the absolute form.
   *
   * Resolution is delegated to `java.net.URL`, so the parent's port and directory are
   * kept for relative links and protocol-relative links (`//host/path`) inherit the
   * parent's scheme. Links that cannot be parsed, or that do not resolve to http or
   * https (e.g. `mailto:`), yield `None`.
   */
  private def makeFullUrl(parent: String, url: String): Option[String] = {
    try {
      val resolved = new URL(new URL(parent), url.trim())
      val protocol = resolved.getProtocol
      if ("http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol))
        Some(resolved.toString)
      else
        None
    } catch {
      // Unknown schemes such as `javascript:` are rejected by the URL constructor.
      case _: java.net.MalformedURLException => None
    }
  }
}